#############################################################
# Author: Mike Burnham, mlb6496@psu.edu
# Python: 3.11.5
# OS: Windows 10
#
# Notes: This script will replicate the confidence intervals
# used to plot figure 2. It will create three files when run:
# 1. bootstrap_ci.log, 2. mcc_matrix.csv, and 
# 3. mcc_matrix_notrump.csv
##############################################################

import pandas as pd
from scipy.stats import bootstrap
from sklearn.metrics import matthews_corrcoef
import logging

# Route every record (DEBUG and above) to bootstrap_ci.log; the "a+" file mode
# appends to an existing log instead of truncating it.
logging.basicConfig(
    filename="bootstrap_ci.log",
    filemode="a+",
    level=logging.DEBUG,
    format="%(asctime)-15s %(levelname)-8s %(message)s",
)
# MCC wrapper to pass to the bootstrap function
def mcc_function(sample1, sample2):
    """Return the Matthews correlation coefficient of two label samples.

    Thin wrapper around ``sklearn.metrics.matthews_corrcoef`` that gives the
    bootstrap routine a plain two-argument statistic to call.

    Args:
        sample1: Labels forwarded as the first argument of
            ``matthews_corrcoef``.
        sample2: Labels forwarded as the second argument of
            ``matthews_corrcoef``.

    Returns:
        Whatever ``matthews_corrcoef(sample1, sample2)`` returns.
    """
    return matthews_corrcoef(sample1, sample2)

# Seed for the bootstrap resampling. The same integer is passed as the
# `random_state` argument of every bootstrap() call in this script so that
# repeated runs produce the same confidence intervals.
random_state = 1

###################################
## CIs for docs that mention Trump
###################################

# Import data, keeping only the documents that mention Trump.
df = pd.read_csv('./trump_test_in_context.csv')
df = df[df['target_mention'] == 1]
df2 = pd.read_csv('./trump_test_nli.csv')
df2 = df2[df2['target_mention'] == 1]
df3 = pd.read_csv('./trump_twitter_supervised.csv')
df3 = df3[df3['target_mention'] == 1]

logging.info("##### CIs for documents that mention Trump #####")

# One row per evaluated classifier:
#   (log title, model label, approach label, data frame, truth column, prediction column)
# The model/approach labels are what end up in the exported CSV.
# NOTE: the Mistral chain-of-thought row used to be labelled 'mistral'
# (lower case), which split one model across two labels in the output; it is
# now 'Mistral' like the other two Mistral rows.
mention_specs = [
    ("GPT-4 No Logit Bias", 'GPT-4', 'Standard', df, 'adjudicated_sup', 'gpt4_nobias'),
    ("GPT-4 Logit Bias", 'GPT-4', 'Logit Bias', df, 'adjudicated_sup', 'gpt4_bias'),
    ("GPT-4 Chain of Thought", 'GPT-4', 'Chain-of-Thought', df, 'adjudicated_sup', 'gpt4_cot'),
    ("GPT-3.5 No Logit Bias", 'GPT-3.5', 'Standard', df, 'adjudicated_sup', 'gpt3_5_nobias'),
    ("GPT-3.5 Logit Bias", 'GPT-3.5', 'Logit Bias', df, 'adjudicated_sup', 'gpt3_5_bias'),
    ("GPT-3.5 Chain of Thought", 'GPT-3.5', 'Chain-of-Thought', df, 'adjudicated_sup', 'gpt3_5_cot'),
    ("Mistral 7B No Logit Bias", 'Mistral', 'Standard', df, 'adjudicated_sup', 'mistral_nobias'),
    ("Mistral 7B Logit Bias", 'Mistral', 'Logit Bias', df, 'adjudicated_sup', 'mistral_bias'),
    ("Mistral 7B Chain of Thought", 'Mistral', 'Chain-of-Thought', df, 'adjudicated_sup', 'mistral_cot'),
    ("DeBERTa", 'DeBERTa', 'NLI', df2, 'adjudicated_sup', 'gL'),
    ("PoliBERTweet", 'PoliBERTweet', 'Supervised', df3, 'labels', 'polibert'),
    ("BERTweet", 'BERTweet', 'Supervised', df3, 'labels', 'bertweet'),
    ("RoBERTa", 'RoBERTa', 'Supervised', df3, 'labels', 'roberta'),
]

# Accumulates one (model, approach, mcc, lower, upper) tuple per classifier.
mention_rows = []
for title, model_label, approach_label, frame, truth_col, pred_col in mention_specs:
    truth = frame[truth_col]
    predicted = frame[pred_col]

    # Point estimate on the full (filtered) sample; computed once and reused
    # for both the log and the exported table.
    point_mcc = mcc_function(truth, predicted)

    # paired=True resamples the truth and prediction for the same document
    # together; vectorized=False because mcc_function handles one resample
    # at a time.
    bootstrap_ci = bootstrap((truth, predicted), mcc_function,
                             confidence_level=0.95,
                             vectorized=False,
                             paired=True,
                             random_state=random_state,
                             method='BCa')
    interval = bootstrap_ci.confidence_interval

    logging.info(title)
    logging.info("MCC: %s", point_mcc)
    logging.info("CI: %s", interval)

    mention_rows.append((model_label, approach_label, point_mcc, interval[0], interval[1]))

# Export Results
# compile results into a dataframe
mcc = pd.DataFrame(mention_rows, columns=['model', 'approach', 'mcc', 'lower', 'upper'])
# export as csv
mcc.to_csv('./mcc_matrix.csv', index=False)


#########################################
## CIs for docs that do not mention Trump
#########################################

# Import data, keeping only the documents that do not mention Trump.
df = pd.read_csv('./trump_test_in_context.csv')
df2 = pd.read_csv('./trump_test_nli.csv')
df3 = pd.read_csv('./trump_twitter_supervised.csv')

df = df[df['target_mention'] == 0]
df2 = df2[df2['target_mention'] == 0]
df3 = df3[df3['target_mention'] == 0]

logging.info("##### CIs for documents that do not mention Trump #####")

# One row per evaluated classifier:
#   (log title, model label, approach label, data frame, truth column, prediction column)
# The model/approach labels are what end up in the exported CSV.
# NOTE: the Mistral chain-of-thought row used to be labelled 'mistral'
# (lower case), which split one model across two labels in the output; it is
# now 'Mistral' like the other two Mistral rows.
no_mention_specs = [
    ("GPT-4 No Logit Bias", 'GPT-4', 'Standard', df, 'adjudicated_sup', 'gpt4_nobias'),
    ("GPT-4 Logit Bias", 'GPT-4', 'Logit Bias', df, 'adjudicated_sup', 'gpt4_bias'),
    ("GPT-4 Chain of Thought", 'GPT-4', 'Chain-of-Thought', df, 'adjudicated_sup', 'gpt4_cot'),
    ("GPT-3.5 No Logit Bias", 'GPT-3.5', 'Standard', df, 'adjudicated_sup', 'gpt3_5_nobias'),
    ("GPT-3.5 Logit Bias", 'GPT-3.5', 'Logit Bias', df, 'adjudicated_sup', 'gpt3_5_bias'),
    ("GPT-3.5 Chain of Thought", 'GPT-3.5', 'Chain-of-Thought', df, 'adjudicated_sup', 'gpt3_5_cot'),
    ("Mistral 7B No Logit Bias", 'Mistral', 'Standard', df, 'adjudicated_sup', 'mistral_nobias'),
    ("Mistral 7B Logit Bias", 'Mistral', 'Logit Bias', df, 'adjudicated_sup', 'mistral_bias'),
    ("Mistral 7B Chain of Thought", 'Mistral', 'Chain-of-Thought', df, 'adjudicated_sup', 'mistral_cot'),
    ("DeBERTa", 'DeBERTa', 'NLI', df2, 'adjudicated_sup', 'gL'),
    ("PoliBERTweet", 'PoliBERTweet', 'Supervised', df3, 'labels', 'polibert'),
    ("BERTweet", 'BERTweet', 'Supervised', df3, 'labels', 'bertweet'),
    ("RoBERTa", 'RoBERTa', 'Supervised', df3, 'labels', 'roberta'),
]

# Accumulates one (model, approach, mcc, lower, upper) tuple per classifier.
no_mention_rows = []
for title, model_label, approach_label, frame, truth_col, pred_col in no_mention_specs:
    truth = frame[truth_col]
    predicted = frame[pred_col]

    # Point estimate on the full (filtered) sample; computed once and reused
    # for both the log and the exported table.
    point_mcc = mcc_function(truth, predicted)

    # paired=True resamples the truth and prediction for the same document
    # together; vectorized=False because mcc_function handles one resample
    # at a time.
    bootstrap_ci = bootstrap((truth, predicted), mcc_function,
                             confidence_level=0.95,
                             vectorized=False,
                             paired=True,
                             random_state=random_state,
                             method='BCa')
    interval = bootstrap_ci.confidence_interval

    logging.info(title)
    logging.info("MCC: %s", point_mcc)
    logging.info("CI: %s", interval)

    no_mention_rows.append((model_label, approach_label, point_mcc, interval[0], interval[1]))

# Export Results
mcc = pd.DataFrame(no_mention_rows, columns=['model', 'approach', 'mcc', 'lower', 'upper'])
mcc.to_csv('./mcc_matrix_notrump.csv', index=False)